***************
* This file generates the table on the occupational coverage of ads posted on Job-Room
* Author: Daniel Kopp
***************

clear
clear matrix
clear mata

set maxvar 8000
set seed 1

use  "data_processed\oste_occupations.dta"

drop abschluss_inland abschluss_ausland		// abschluss_ausland has only missings and 93% of abschluss_inland are missing as well

* Drop variables we don't need
drop ergaenzende_angaben 

*******
* Drop duplicates
*******

* Sometimes the only variation within the ads comes from different levels of experience or education.
* We take the maximum experience and education level within ads if there is variation
foreach var of varlist  cod_erfahrung cod_ausbildung  {
	bys oste_id_avam : egen max_`var' = max(`var') if `var'!=.
	drop `var' 
	rename max_`var' `var'
}

duplicates drop oste_id_avam cod_b_avam cod_isco5  cod_erfahrung cod_ausbildung 	, force

duplicates drop oste_id_avam cod_b_avam  	, force		// 3 additional variables dropped because there has been an error in the original data (wrong assignment of isco to avam code)

drop cod_erfahrung cod_ausbildung cod_b_avam
		
tostring  cod_isco5, gen(cod_isco5_str)

gen 	isco4_str 	= substr(cod_isco5_str,1,strlen(cod_isco5_str)-1)
replace isco4_str = "-1" if isco4_str=="-"
destring isco4_str	, gen(isco_4)
drop isco4_str cod_isco5_str cod_isco5

* We drop all observations with the same ad-id and the same isco-4 digit occupation
duplicates drop oste_id_avam isco_4  	, force	

distinct oste_id_avam 	// 563638
bys oste_id_avam: egen diff_isco4 = nvals(isco_4)
bys oste_id_avam: gen n = _n

tab diff_isco4 if n==1,m		// 82.2% of the ads with only 1 isco-4 occupation, 13.6% with 2 and 4.2% with more than 2 	
count if isco_4==-1 	// 1'648

* Gen Isco 3 digit
recode isco_4 (-1=.)
tostring isco_4, gen(isco_4_str)
gen 	 isco_3_str 	= substr(isco_4_str,1,strlen(isco_4_str)-1)
destring isco_3_str	, gen(isco_3)
drop isco_4_str  

* Gen Isco 2 digit
gen 	 isco_2_str 	= substr(isco_3_str,1,strlen(isco_3_str)-1)
destring isco_2_str	, gen(isco_2)
drop isco_3_str  

* Gen Isco 1 digit
gen 	 isco_1_str 	= substr(isco_2_str,1,strlen(isco_2_str)-1)
destring isco_1_str	, gen(isco_1)
drop isco_2_str  isco_1_str
replace isco_1 = 0 if inrange(isco_2,1,3)

recode isco_1 (0=.)

forvalues i = 1/4 {
cap run "Help_files\label_isco08_`i'.do"
label values isco_`i' isco08_`i'_lab
}

tab isco_2
foreach i in  20 30  70  {
	recode isco_2 (`i'=.)
}

******************************
* Merge other data to the dataset
******************************

cap drop _merge
merge m:1 oste_id_avam using "data_processed\oste.dta" 
keep if _merge==3
drop _merge

* We merge the number of employees per ISCO 4-digit occupation from the Structural Survey ("Strukturerhebung") to our dataset. 
* We then have the full set of isco 4-digit occupations (498, or 495 if we ignore the 3-digit codes - see below). 
gen s_isco_08_4 = isco_4
merge m:1 s_isco_08_4 using "Misc_files\employment_by_isco4.dta", keepusing(employed_isco4)

drop if s_isco_08_4==-7
drop if s_isco_08_4==-9

tab s_isco_08_4 if _merge==1  // 10 occupations that are in our sample but not in the Strukturerhebung (apparently because there are no workers employed in those occupations: 2353, 2354, 2510, 3253, 4222, 4320, 8150, 8172, 9213, 9411) 

* Drop armed forces
drop if s_isco_08_4==110
drop if s_isco_08_4==210
drop if s_isco_08_4==310

bys s_isco_08_4: gen n_isco4 = _n

******************************
* Calculate the share of job ads on Job-Room by occupation and the corresponding employment share of this occupation
******************************

egen employed_total = total(employed_isco4) if n_isco4==1 & s_isco_08_4!=.
tab employed_total		// Total:   4,027,732 workers for which we know in which occupation they work (but some of them are assigned to very unspecific occupations, e.g. isco-4==1000 or isco-4==2000 - below I drop these occupations)
drop employed_total

* Gen number of total workers employed in every isco-1 occupation	
preserve
	collapse isco_1 employed_isco4, by(s_isco_08_4)
	
	drop if s_isco_08_4==.
	
	* We drop those categories that don't exist on a 4 digit level (i.e. all occupations with 3 zeros at the end)
	forvalues i = 1/9 {
	    drop if s_isco_08_4==`i'000
	}
	
	* Gen isco-1 classification that encompasses all isco-4 observations
	gen 		isco_1_compl = s_isco_08_4
	tostring 	isco_1_compl, replace
	replace 	isco_1_compl = substr(isco_1_compl,1,1)
	destring 	isco_1_compl, replace
	
	collapse isco_1 (sum) employed_isco1=employed_isco4, by(isco_1_compl)
	drop if isco_1==. 

	tempfile formerge_isco1
	save `formerge_isco1', replace		
restore
drop if _merge==2
drop _merge

merge 	m:1 isco_1 using `formerge_isco1'
drop _merge

bys isco_1: gen n_isco1 = _n
egen 	employed_total_ = total(employed_isco1) if n_isco1==1	
egen 	employed_total  = mean(employed_total_) 				//  3'772'255
sum employed_total
drop employed_total_

gen share_empl_isco1 = employed_isco1/employed_total
tabstat share_empl_isco1, by(isco_1)
		
* Merge the data with the number of all ads published on Job-Room by isco-1:

merge m:1 isco_1 using "Misc_files\job_ads_by_isco1_api.dta"

sort dat_freigabe_avam
		
* Save table with the share job ads per isco 1-digit occupation and with the respective employment share in the economy
eststo clear
preserve
drop if isco_1==. 
bys isco_1: egen obs_isco1 = count(oste_id_avam) 
gen total_obs = _N
keep if inrange(dat_freigabe_avam,td(31dec2019),td(01jun2021))
gen total_obs_restr = _N
collapse share_obs_isco1_api total_obs total_obs_restr share_empl_isco1 obs_isco1 (count) obs_isco1_restr=oste_id_avam, by(isco_1)
	gen share_obs_isco1 = obs_isco1/total_obs
	gen share_obs_isco1_restr =  obs_isco1_restr/total_obs_restr
estpost tabstat share_obs_isco1 , by(isco_1) notot
estimates store obs_per_isco1
estpost tabstat share_obs_isco1_restr , by(isco_1) notot
estimates store obs_per_isco1_restr
estpost tabstat share_obs_isco1_api, by(isco_1) notot
estimates store obs_per_isco1_api
restore

esttab obs_per_isco1 obs_per_isco1_restr obs_per_isco1_api , cells("mean(fmt(%9.3f))") label noobs collabels(none) varlabels(`e(labels)')  ///
		mtitle("\makecell{Ads directly \\ reported \\ 07/2018--06/2021}" "\makecell{Ads directly \\ reported \\ 01/2020--05/2021}" "\makecell{All  ads \\ \\ 01/2020--05/2021}"  ) ///		
		title(Share of observations and employment per occupation) 
esttab obs_per_isco1 obs_per_isco1_restr obs_per_isco1_api  using "$results_part_time\table_d4.tex", replace ///
		cells("mean(fmt(%9.3f))") label noobs collabels(none) varlabels(`e(labels)')  frag ///
		mtitle("\makecell{Ads directly \\ reported \\ 07/2018--06/2021}" "\makecell{Ads directly \\ reported \\ 01/2020--05/2021}" "\makecell{All  ads \\ \\ 01/2020--05/2021}"  ) ///	
		title(Share of observations and employment per occupation)
		
		

		